{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "from requests_html import HTMLSession\n",
    "import requests_html\n",
    "import pandas as pd\n",
    "import urllib.parse\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "# A1  nfu.edu.cn \n",
    "session = HTMLSession()\n",
    "r = session.get(\"https://www.nfu.edu.cn/xxyw/index.htm\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 存\n",
    "with open (\"html_out/_nfu_文学与传媒学院.html\", encoding = \"utf8\", mode = \"w\") as fp:\n",
    "    fp.write(r.html.html)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 读\n",
    "with open (\"html_out/_nfu_文学与传媒学院.html\", encoding = \"utf8\", mode = \"r\") as fp:\n",
    "    html_load = fp.read()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[{'https://www.nfu.edu.cn/xxyw/5b71d46d3b114859ae92f7535a7d60c9.htm'},\n",
       " {'https://www.nfu.edu.cn/xxyw/f9bcd8092b494a04becfaf48b3138e20.htm'},\n",
       " {'https://www.nfu.edu.cn/xxyw/f0002a2424f34ad8b258adb1d07ca28b.htm'},\n",
       " {'https://www.nfu.edu.cn/xxyw/48b0929919ec4d2d9a2cdc278fc884ea.htm'},\n",
       " {'https://www.nfu.edu.cn/xxyw/0d7bd841484a42a69d241e79365b6290.htm'},\n",
       " {'https://www.nfu.edu.cn/xxyw/debb2f222e024cbda5d2644acb6c552c.htm'},\n",
       " {'https://www.nfu.edu.cn/xxyw/e5378134dbaf4b7b88d3003f1cd99e59.htm'},\n",
       " {'https://www.nfu.edu.cn/xxyw/7c865b16b203467ab6ddf5569f73e5c1.htm'},\n",
       " {'https://www.nfu.edu.cn/xxyw/28b0ad0eee8149e6b7f4ae65395910ff.htm'},\n",
       " {'https://www.nfu.edu.cn/xxyw/c48c33c8f744430eb9417b800a8b2e3f.htm'},\n",
       " {'https://www.nfu.edu.cn/xxyw/395b8e2ba5df47c59d080d50d1113be1.htm'},\n",
       " {'https://www.nfu.edu.cn/xxyw/59bda093ced440f78c638ade40ab0b93.htm'},\n",
       " {'https://www.nfu.edu.cn/xxyw/1af5590575b74762b624f048b5ad79f4.htm'},\n",
       " {'https://www.nfu.edu.cn/xxyw/4e32521de0da4d21979182e1b114a964.htm'},\n",
       " {'https://www.nfu.edu.cn/xxyw/23279088871e4b89b8eab2e7fbc77b17.htm'},\n",
       " {'https://www.nfu.edu.cn/xxyw/a5de3999469447b488857144f58f8c27.htm'},\n",
       " {'https://www.nfu.edu.cn/xxyw/6273fd9185b54b20a0af15b9878f1d2c.htm'},\n",
       " {'https://www.nfu.edu.cn/xxyw/a1f9ac1d39704e4d8136478ec97e3635.htm'},\n",
       " {'https://www.nfu.edu.cn/xxyw/c438a1ec6db5446faf76617654b5ca55.htm'},\n",
       " {'https://www.nfu.edu.cn/xxyw/f28729353ff749b9b170825ffe346949.htm'}]"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "list_URL=[i.absolute_links for i in r.html.xpath('//div[@class=\"news_title\"]/a')]\n",
    "list_URL\n",
    "# 绝对路径，可直接得出链接，不需要再去拼接"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "# # 解析\n",
    "# parsed = requests_html.soup_parse(html_load)\n",
    "# parsed"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 51,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "ParseResult(scheme='https', netloc='www.nfu.edu.cn', path='/xxyw/index19.htm', params='', query='', fragment='')"
      ]
     },
     "execution_count": 51,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 解析\n",
    "base_url = r.url\n",
    "nfu_urlparse = urllib.parse.urlparse(base_url)\n",
    "nfu_urlparse"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "# # 重组链接\n",
    "# list_URL  = [urllib.parse.urlunparse\\\n",
    "# ([nfu_urlparse.scheme,nfu_urlparse.netloc,'/'+ nfu_urlparse.path.split('/')[1] +'/' + detail_url,'','',''])\\\n",
    "# for detail_url in parsed.xpath('//div[@class=\"news_title\"]/a/@href')]\n",
    "# list_URL"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 51,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>标题</th>\n",
       "      <th>链结</th>\n",
       "      <th>日期</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>我校召开2021年一流专业、一流课程、教学成果奖申报工作推进会</td>\n",
       "      <td>https://www.nfu.edu.cn/xxyw/48b0929919ec4d2d9a...</td>\n",
       "      <td>2021-04-02</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>我校承办首届 “新时代从商培养工程”</td>\n",
       "      <td>https://www.nfu.edu.cn/xxyw/0d7bd841484a42a69d...</td>\n",
       "      <td>2021-04-02</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>专注当下，冲刺高考，奋斗出最美的青春</td>\n",
       "      <td>https://www.nfu.edu.cn/xxyw/f9bcd8092b494a04be...</td>\n",
       "      <td>2021-04-02</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>广东工业大学华立学院来访我校</td>\n",
       "      <td>https://www.nfu.edu.cn/xxyw/debb2f222e024cbda5...</td>\n",
       "      <td>2021-03-31</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>“疫情下的中国、美国以及中美关系”高层论坛暨广州南方学院“美国研究中心”成立五周年纪念研讨会</td>\n",
       "      <td>https://www.nfu.edu.cn/xxyw/e5378134dbaf4b7b88...</td>\n",
       "      <td>2021-03-31</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>“思想政治理论第一课”上，学校党委书记、校长讲了这些！</td>\n",
       "      <td>https://www.nfu.edu.cn/xxyw/7c865b16b203467ab6...</td>\n",
       "      <td>2021-03-29</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>学生报道|传承红色基因，讲好党员故事：我校举办身边优秀党员事迹分享会</td>\n",
       "      <td>https://www.nfu.edu.cn/xxyw/28b0ad0eee8149e6b7...</td>\n",
       "      <td>2021-03-29</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>权威发布：我校名列2021年中国民办高校第四</td>\n",
       "      <td>https://www.nfu.edu.cn/xxyw/c48c33c8f744430eb9...</td>\n",
       "      <td>2021-03-26</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>喜讯：电气学院学子在国际顶尖期刊发表学术论文</td>\n",
       "      <td>https://www.nfu.edu.cn/xxyw/395b8e2ba5df47c59d...</td>\n",
       "      <td>2021-03-26</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>我校召开行政人员专题培训暨2021年春季学期第一次办公室工作例会</td>\n",
       "      <td>https://www.nfu.edu.cn/xxyw/59bda093ced440f78c...</td>\n",
       "      <td>2021-03-19</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10</th>\n",
       "      <td>我校开展2021年春季学期教学检查工作</td>\n",
       "      <td>https://www.nfu.edu.cn/xxyw/1af5590575b74762b6...</td>\n",
       "      <td>2021-03-15</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11</th>\n",
       "      <td>媒体报道我校入选国家级一流本科专业建设点情况</td>\n",
       "      <td>https://www.nfu.edu.cn/xxyw/4e32521de0da4d2197...</td>\n",
       "      <td>2021-03-15</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>12</th>\n",
       "      <td>我校举行转设更名挂牌仪式</td>\n",
       "      <td>https://www.nfu.edu.cn/xxyw/23279088871e4b89b8...</td>\n",
       "      <td>2021-03-12</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>13</th>\n",
       "      <td>广东省教育厅公布国家级一流专业建设点数量排名：我校位列广东高校第29，同类院校第1</td>\n",
       "      <td>https://www.nfu.edu.cn/xxyw/a5de3999469447b488...</td>\n",
       "      <td>2021-03-12</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>14</th>\n",
       "      <td>我校政商研究院学生胡志翔在国际期刊发表论文</td>\n",
       "      <td>https://www.nfu.edu.cn/xxyw/6273fd9185b54b20a0...</td>\n",
       "      <td>2021-03-12</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>15</th>\n",
       "      <td>我校举行大一学生升旗仪式暨晨跑之星颁奖大会</td>\n",
       "      <td>https://www.nfu.edu.cn/xxyw/a1f9ac1d39704e4d81...</td>\n",
       "      <td>2021-03-12</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>16</th>\n",
       "      <td>我校召开党史学习教育动员大会</td>\n",
       "      <td>https://www.nfu.edu.cn/xxyw/c438a1ec6db5446faf...</td>\n",
       "      <td>2021-03-11</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>17</th>\n",
       "      <td>我校组织参加全省教育系统党史学习教育动员部署会视频会议</td>\n",
       "      <td>https://www.nfu.edu.cn/xxyw/f28729353ff749b9b1...</td>\n",
       "      <td>2021-03-09</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>18</th>\n",
       "      <td>喜讯：我校计算机科学与技术专业获IEET认证</td>\n",
       "      <td>https://www.nfu.edu.cn/xxyw/f25bba2bf25d43399a...</td>\n",
       "      <td>2021-03-08</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>19</th>\n",
       "      <td>我校师生深入学习习近平总书记在政协医药卫生界教育界联组会上的重要讲话精神</td>\n",
       "      <td>https://www.nfu.edu.cn/xxyw/d6a77d315f9844618e...</td>\n",
       "      <td>2021-03-08</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                                标题  \\\n",
       "0                  我校召开2021年一流专业、一流课程、教学成果奖申报工作推进会   \n",
       "1                               我校承办首届 “新时代从商培养工程”   \n",
       "2                               专注当下，冲刺高考，奋斗出最美的青春   \n",
       "3                                   广东工业大学华立学院来访我校   \n",
       "4   “疫情下的中国、美国以及中美关系”高层论坛暨广州南方学院“美国研究中心”成立五周年纪念研讨会   \n",
       "5                      “思想政治理论第一课”上，学校党委书记、校长讲了这些！   \n",
       "6               学生报道|传承红色基因，讲好党员故事：我校举办身边优秀党员事迹分享会   \n",
       "7                           权威发布：我校名列2021年中国民办高校第四   \n",
       "8                           喜讯：电气学院学子在国际顶尖期刊发表学术论文   \n",
       "9                 我校召开行政人员专题培训暨2021年春季学期第一次办公室工作例会   \n",
       "10                             我校开展2021年春季学期教学检查工作   \n",
       "11                          媒体报道我校入选国家级一流本科专业建设点情况   \n",
       "12                                    我校举行转设更名挂牌仪式   \n",
       "13       广东省教育厅公布国家级一流专业建设点数量排名：我校位列广东高校第29，同类院校第1   \n",
       "14                           我校政商研究院学生胡志翔在国际期刊发表论文   \n",
       "15                           我校举行大一学生升旗仪式暨晨跑之星颁奖大会   \n",
       "16                                  我校召开党史学习教育动员大会   \n",
       "17                     我校组织参加全省教育系统党史学习教育动员部署会视频会议   \n",
       "18                          喜讯：我校计算机科学与技术专业获IEET认证   \n",
       "19            我校师生深入学习习近平总书记在政协医药卫生界教育界联组会上的重要讲话精神   \n",
       "\n",
       "                                                   链结          日期  \n",
       "0   https://www.nfu.edu.cn/xxyw/48b0929919ec4d2d9a...  2021-04-02  \n",
       "1   https://www.nfu.edu.cn/xxyw/0d7bd841484a42a69d...  2021-04-02  \n",
       "2   https://www.nfu.edu.cn/xxyw/f9bcd8092b494a04be...  2021-04-02  \n",
       "3   https://www.nfu.edu.cn/xxyw/debb2f222e024cbda5...  2021-03-31  \n",
       "4   https://www.nfu.edu.cn/xxyw/e5378134dbaf4b7b88...  2021-03-31  \n",
       "5   https://www.nfu.edu.cn/xxyw/7c865b16b203467ab6...  2021-03-29  \n",
       "6   https://www.nfu.edu.cn/xxyw/28b0ad0eee8149e6b7...  2021-03-29  \n",
       "7   https://www.nfu.edu.cn/xxyw/c48c33c8f744430eb9...  2021-03-26  \n",
       "8   https://www.nfu.edu.cn/xxyw/395b8e2ba5df47c59d...  2021-03-26  \n",
       "9   https://www.nfu.edu.cn/xxyw/59bda093ced440f78c...  2021-03-19  \n",
       "10  https://www.nfu.edu.cn/xxyw/1af5590575b74762b6...  2021-03-15  \n",
       "11  https://www.nfu.edu.cn/xxyw/4e32521de0da4d2197...  2021-03-15  \n",
       "12  https://www.nfu.edu.cn/xxyw/23279088871e4b89b8...  2021-03-12  \n",
       "13  https://www.nfu.edu.cn/xxyw/a5de3999469447b488...  2021-03-12  \n",
       "14  https://www.nfu.edu.cn/xxyw/6273fd9185b54b20a0...  2021-03-12  \n",
       "15  https://www.nfu.edu.cn/xxyw/a1f9ac1d39704e4d81...  2021-03-12  \n",
       "16  https://www.nfu.edu.cn/xxyw/c438a1ec6db5446faf...  2021-03-11  \n",
       "17  https://www.nfu.edu.cn/xxyw/f28729353ff749b9b1...  2021-03-09  \n",
       "18  https://www.nfu.edu.cn/xxyw/f25bba2bf25d43399a...  2021-03-08  \n",
       "19  https://www.nfu.edu.cn/xxyw/d6a77d315f9844618e...  2021-03-08  "
      ]
     },
     "execution_count": 51,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# # 输出结果\n",
    "# # B-D-1 pd.DataFrame 建构，pandas课有教\n",
    "# df = pd.DataFrame( {\n",
    "#          \"标题\": parsed.xpath('//div[@class=\"news_title\"]/a/@title'),\n",
    "#          \"链结\": list_URL,\n",
    "#          \"日期\": parsed.xpath('//font[@class=\"right-more\"]/text()'),\n",
    "#      } )\n",
    "# df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "# df.to_excel(\"data_out/xuexiaoyaowen.xlsx\", sheet_name=\"检索结果\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'https://www.nfu.edu.cn/xxyw/index19.htm'"
      ]
     },
     "execution_count": 37,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "base_url_01 = r.url\n",
    "base_url_01"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "SplitResult(scheme='https', netloc='www.nfu.edu.cn', path='/xxyw/index19.htm', query='', fragment='')"
      ]
     },
     "execution_count": 38,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "urllib.parse.urlsplit(base_url_01)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>第一页</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>https</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>www.nfu.edu.cn</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>/xxyw/index19.htm</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td></td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td></td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                 第一页\n",
       "0              https\n",
       "1     www.nfu.edu.cn\n",
       "2  /xxyw/index19.htm\n",
       "3                   \n",
       "4                   "
      ]
     },
     "execution_count": 39,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df = pd.DataFrame(urllib.parse.urlsplit(base_url_01)).rename({0:\"第一页\"},axis=1)\n",
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'https://www.nfu.edu.cn/xxyw/index2.htm'"
      ]
     },
     "execution_count": 40,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 第二页\n",
    "base_url_02 = session.get('https://www.nfu.edu.cn/xxyw/index2.htm').url\n",
    "base_url_02"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>第一页</th>\n",
       "      <th>第二页</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>https</td>\n",
       "      <td>https</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>www.nfu.edu.cn</td>\n",
       "      <td>www.nfu.edu.cn</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>/xxyw/index19.htm</td>\n",
       "      <td>/xxyw/index2.htm</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                 第一页               第二页\n",
       "0              https             https\n",
       "1     www.nfu.edu.cn    www.nfu.edu.cn\n",
       "2  /xxyw/index19.htm  /xxyw/index2.htm\n",
       "3                                     \n",
       "4                                     "
      ]
     },
     "execution_count": 41,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df['第二页'] = urllib.parse.urlsplit(base_url_02)\n",
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1\n"
     ]
    }
   ],
   "source": [
    "for i in range(1,100):\n",
    "    r = session.get('https://www.nfu.edu.cn//index'+str(i)+'.htm')\n",
    "    if r.status_code != 200:\n",
    "        print(i)\n",
    "        break"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['https://www.nfu.edu.cn/xxyw/index1.htm',\n",
       " 'https://www.nfu.edu.cn/xxyw/index2.htm',\n",
       " 'https://www.nfu.edu.cn/xxyw/index3.htm',\n",
       " 'https://www.nfu.edu.cn/xxyw/index4.htm',\n",
       " 'https://www.nfu.edu.cn/xxyw/index5.htm',\n",
       " 'https://www.nfu.edu.cn/xxyw/index6.htm',\n",
       " 'https://www.nfu.edu.cn/xxyw/index7.htm',\n",
       " 'https://www.nfu.edu.cn/xxyw/index8.htm',\n",
       " 'https://www.nfu.edu.cn/xxyw/index9.htm',\n",
       " 'https://www.nfu.edu.cn/xxyw/index10.htm',\n",
       " 'https://www.nfu.edu.cn/xxyw/index11.htm',\n",
       " 'https://www.nfu.edu.cn/xxyw/index12.htm',\n",
       " 'https://www.nfu.edu.cn/xxyw/index13.htm',\n",
       " 'https://www.nfu.edu.cn/xxyw/index14.htm',\n",
       " 'https://www.nfu.edu.cn/xxyw/index15.htm',\n",
       " 'https://www.nfu.edu.cn/xxyw/index16.htm',\n",
       " 'https://www.nfu.edu.cn/xxyw/index17.htm',\n",
       " 'https://www.nfu.edu.cn/xxyw/index18.htm',\n",
       " 'https://www.nfu.edu.cn/xxyw/index19.htm']"
      ]
     },
     "execution_count": 43,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "url_group = ['https://www.nfu.edu.cn/xxyw/index'+str(i)+'.htm' for i in range(1,20)]\n",
    "url_group"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['https://www.nfu.edu.cn/xxyw/index.htm',\n",
       " 'https://www.nfu.edu.cn/xxyw/index1.htm',\n",
       " 'https://www.nfu.edu.cn/xxyw/index2.htm',\n",
       " 'https://www.nfu.edu.cn/xxyw/index3.htm',\n",
       " 'https://www.nfu.edu.cn/xxyw/index4.htm',\n",
       " 'https://www.nfu.edu.cn/xxyw/index5.htm',\n",
       " 'https://www.nfu.edu.cn/xxyw/index6.htm',\n",
       " 'https://www.nfu.edu.cn/xxyw/index7.htm',\n",
       " 'https://www.nfu.edu.cn/xxyw/index8.htm',\n",
       " 'https://www.nfu.edu.cn/xxyw/index9.htm',\n",
       " 'https://www.nfu.edu.cn/xxyw/index10.htm',\n",
       " 'https://www.nfu.edu.cn/xxyw/index11.htm',\n",
       " 'https://www.nfu.edu.cn/xxyw/index12.htm',\n",
       " 'https://www.nfu.edu.cn/xxyw/index13.htm',\n",
       " 'https://www.nfu.edu.cn/xxyw/index14.htm',\n",
       " 'https://www.nfu.edu.cn/xxyw/index15.htm',\n",
       " 'https://www.nfu.edu.cn/xxyw/index16.htm',\n",
       " 'https://www.nfu.edu.cn/xxyw/index17.htm',\n",
       " 'https://www.nfu.edu.cn/xxyw/index18.htm',\n",
       " 'https://www.nfu.edu.cn/xxyw/index19.htm']"
      ]
     },
     "execution_count": 44,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "url_group.insert(0,'https://www.nfu.edu.cn/xxyw/index.htm')\n",
    "url_group"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'/xxyw/index.htm'"
      ]
     },
     "execution_count": 45,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "urllib.parse.urlparse(url_group[0]).path"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "metadata": {},
   "outputs": [],
   "source": [
    "for url in url_group:\n",
    "    r = session.get(url)\n",
    "#     print(r.html.html)\n",
    "    path = urllib.parse.urlparse(url).path\n",
    "    with open ('html_out/'+path, encoding = \"utf8\", mode = \"w\") as fp:\n",
    "        fp.write(r.html.html)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 47,
   "metadata": {},
   "outputs": [],
   "source": [
    "# xpath 准备：\n",
    "dict_xpath = {\n",
    "    '链接_xpath':'//div[@class=\"news_title\"]/a/@href',\n",
    "    '标题_xpath':'//div[@class=\"news_title\"]/a/@title',\n",
    "    '日期_xpath':'//font[@class=\"right-more\"]/text()'\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "metadata": {},
   "outputs": [],
   "source": [
    "def pages_content_url(parsed):\n",
    "    list_URL  = [urllib.parse.urlunparse\\\n",
    "                 ([nfu_urlparse.scheme,nfu_urlparse.netloc,'/'+ nfu_urlparse.path.split('/')[1] +'/' + detail_url,'','',''])\\\n",
    "                 for detail_url in parsed.xpath(dict_xpath['链接_xpath'])]\n",
    "    return list_URL"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 52,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['index.htm', 'index1.htm', 'index10.htm', 'index11.htm', 'index12.htm', 'index13.htm', 'index14.htm', 'index15.htm', 'index16.htm', 'index17.htm', 'index18.htm', 'index19.htm', 'index2.htm', 'index3.htm', 'index4.htm', 'index5.htm', 'index6.htm', 'index7.htm', 'index8.htm', 'index9.htm']\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>index</th>\n",
       "      <th>标题</th>\n",
       "      <th>链结</th>\n",
       "      <th>日期</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>2</td>\n",
       "      <td>我校召开高校教师职称评审 政策解读专题报告会</td>\n",
       "      <td>https://www.nfu.edu.cn/xxyw/f0002a2424f34ad8b2...</td>\n",
       "      <td>2021-04-10</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0</td>\n",
       "      <td>快！来为我校大学生国旗护卫队参赛点赞！</td>\n",
       "      <td>https://www.nfu.edu.cn/xxyw/5b71d46d3b114859ae...</td>\n",
       "      <td>2021-04-09</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>4</td>\n",
       "      <td>我校承办首届 “新时代从商培养工程”</td>\n",
       "      <td>https://www.nfu.edu.cn/xxyw/0d7bd841484a42a69d...</td>\n",
       "      <td>2021-04-02</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1</td>\n",
       "      <td>专注当下，冲刺高考，奋斗出最美的青春</td>\n",
       "      <td>https://www.nfu.edu.cn/xxyw/f9bcd8092b494a04be...</td>\n",
       "      <td>2021-04-02</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>3</td>\n",
       "      <td>我校召开2021年一流专业、一流课程、教学成果奖申报工作推进会</td>\n",
       "      <td>https://www.nfu.edu.cn/xxyw/48b0929919ec4d2d9a...</td>\n",
       "      <td>2021-04-02</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>236</th>\n",
       "      <td>16</td>\n",
       "      <td>我校与辅仁大学座谈会顺利召开</td>\n",
       "      <td>https://www.nfu.edu.cn/xxyw/e011bf0725c943669b...</td>\n",
       "      <td>2019-03-15</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>234</th>\n",
       "      <td>14</td>\n",
       "      <td>建设美丽乡村，拥抱健康人生——平安广州宣讲团禁毒宣传活动走进我校</td>\n",
       "      <td>https://www.nfu.edu.cn/xxyw/1cd4f279f597455483...</td>\n",
       "      <td>2019-03-15</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>237</th>\n",
       "      <td>17</td>\n",
       "      <td>任剑涛教授莅临我校开展“人工智能与权利拟制”学术讲座</td>\n",
       "      <td>https://www.nfu.edu.cn/xxyw/d54a6097bdda46c483...</td>\n",
       "      <td>2019-03-14</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>239</th>\n",
       "      <td>19</td>\n",
       "      <td>“校园新闻采写那些事儿”——我校校园记者系列培训第三讲顺利开展</td>\n",
       "      <td>https://www.nfu.edu.cn/xxyw/942036331962458493...</td>\n",
       "      <td>2019-03-08</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>238</th>\n",
       "      <td>18</td>\n",
       "      <td>我校召开中层干部工作会议</td>\n",
       "      <td>https://www.nfu.edu.cn/xxyw/0b2a4990495c4980a7...</td>\n",
       "      <td>2019-03-08</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>400 rows × 4 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "     index                                标题  \\\n",
       "2        2            我校召开高校教师职称评审 政策解读专题报告会   \n",
       "0        0               快！来为我校大学生国旗护卫队参赛点赞！   \n",
       "4        4                我校承办首届 “新时代从商培养工程”   \n",
       "1        1                专注当下，冲刺高考，奋斗出最美的青春   \n",
       "3        3   我校召开2021年一流专业、一流课程、教学成果奖申报工作推进会   \n",
       "..     ...                               ...   \n",
       "236     16                    我校与辅仁大学座谈会顺利召开   \n",
       "234     14  建设美丽乡村，拥抱健康人生——平安广州宣讲团禁毒宣传活动走进我校   \n",
       "237     17        任剑涛教授莅临我校开展“人工智能与权利拟制”学术讲座   \n",
       "239     19   “校园新闻采写那些事儿”——我校校园记者系列培训第三讲顺利开展   \n",
       "238     18                      我校召开中层干部工作会议   \n",
       "\n",
       "                                                    链结          日期  \n",
       "2    https://www.nfu.edu.cn/xxyw/f0002a2424f34ad8b2...  2021-04-10  \n",
       "0    https://www.nfu.edu.cn/xxyw/5b71d46d3b114859ae...  2021-04-09  \n",
       "4    https://www.nfu.edu.cn/xxyw/0d7bd841484a42a69d...  2021-04-02  \n",
       "1    https://www.nfu.edu.cn/xxyw/f9bcd8092b494a04be...  2021-04-02  \n",
       "3    https://www.nfu.edu.cn/xxyw/48b0929919ec4d2d9a...  2021-04-02  \n",
       "..                                                 ...         ...  \n",
       "236  https://www.nfu.edu.cn/xxyw/e011bf0725c943669b...  2019-03-15  \n",
       "234  https://www.nfu.edu.cn/xxyw/1cd4f279f597455483...  2019-03-15  \n",
       "237  https://www.nfu.edu.cn/xxyw/d54a6097bdda46c483...  2019-03-14  \n",
       "239  https://www.nfu.edu.cn/xxyw/942036331962458493...  2019-03-08  \n",
       "238  https://www.nfu.edu.cn/xxyw/0b2a4990495c4980a7...  2019-03-08  \n",
       "\n",
       "[400 rows x 4 columns]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "import os\n",
    "\n",
    "list_df = []\n",
    "\n",
    "\n",
    "files= os.listdir('html_out/xxyw/')\n",
    "print(files)\n",
    "\n",
    "for html in files:\n",
    "    with open('html_out/xxyw/'+html,encoding='utf8',mode='r') as fp:\n",
    "        html_load = fp.read()\n",
    "        parsed = requests_html.soup_parse(html_load)\n",
    "        list_URL = pages_content_url(parsed)\n",
    "        \n",
    "        df = pd.DataFrame( {\n",
    "         \"标题\": parsed.xpath(dict_xpath['标题_xpath']),\n",
    "         \"链结\": list_URL,\n",
    "         \"日期\": parsed.xpath(dict_xpath['日期_xpath']),\n",
    "        } )\n",
    "        list_df.append(df)\n",
    "\n",
    "        \n",
    "        \n",
    "df_all = pd.concat(list_df).reset_index().sort_values(by='日期',ascending=False)\n",
    "display(df_all)    \n",
    "\n",
    "with pd.ExcelWriter('data_out/学校要问.xlsx',mode='w',engine=\"openpyxl\") as writer:  \n",
    "            df_all.to_excel(writer, sheet_name='媒体报道')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
